R Markdown
# Custom functions defined here
# This function accepts a tibble as generated by the gutenberg_download() function.
# It produces a tibble containing bigrams with stop words filtered.
process_text_to_bigram <- function(t){ t %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% myth_stop_words$word) %>%
filter(!word2 %in% myth_stop_words$word) %>%
filter(!str_detect(word1, "\\d")) %>%
filter(!str_detect(word2, "\\d")) %>%
unite(bigram, word1, word2, sep = " ")
}
# Generates igraph object from counted bigrams. Optional second arg sets filter threshold to remove infrequent pairs.
graph_from_counts <- function(t, x=5){ t %>%
separate(bigram, c("word1", "word2")) %>%
filter(n > x) %>%
graph_from_data_frame()}
# Unnests raw gutenberg_download tibble by sentence.
text_to_sentence <- function(t) {
t %>%
unnest_tokens(sentences, text, token = "sentences") %>%
mutate(sentence_number = row_number())
}
# Isolates sentences to only those containing a thematic keyword defined in 'thematic keywords' vector.
# Returns thematic_lines tibble. If pull_nearby is TRUE, this function will call pull_nearby_lines
# to return the thematic sentences with the surrounding lines in the text.
isolate_thematic_sentences <- function(t, pull_nearby=FALSE) {
thematic_lines <- t %>%
select(sentences,sentence_number) %>%
filter(str_detect(t$sentences, thematic_keywords))
if (pull_nearby)
{
thematic_lines <- t %>% pull_nearby_lines(thematic_lines)
}
return(thematic_lines)
}
# Unnests sentence level tokenization to compare each word in a sentence with every other word in sentence
# filters to remove cases where the words are the same. Generates bigrams.
bigrams_by_sentence <- function(t) {
t <- t %>%
select(sentences,sentence_number) %>%
unnest_tokens(word,sentences)
t <- full_join(t,t, by="sentence_number") %>%
filter(!(word.x == word.y)) %>%
select(word.x,word.y) %>%
unite(bigram, word.x, word.y, sep=" ")
}
# Takes a bigram and filters out stop words, then isolates thematic keywords according to 'thematic keywords' vector
count_thematic_bigrams <- function(t) {
t %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% myth_stop_words$word) %>%
filter(!word2 %in% myth_stop_words$word) %>%
filter(!str_detect(word1, "\\d")) %>%
filter(!str_detect(word2, "\\d")) %>%
filter(word1 %in% thematic_keywords | word2 %in% thematic_keywords) %>%
unite(bigram, word1, word2, sep = " ") %>%
count(bigram,sort="TRUE")
}
# Helper function to plot graph from igraph object with consistent styling.
plot_graph <- function(g) {
a <- grid::arrow(type = "closed", length = unit(.10, "inches"))
ggraph(g, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_label(aes(label = name), vjust = 1, hjust = 1, check_overlap = TRUE, repel=TRUE) +
theme_void()
}
# Given a tibble with a document tokenized by sentence, and a tibble with isolated lines, this function
# will return a new tibble with the isolated lines and their adjacent lines. The range of this
# is denoted by the n parameter.
pull_nearby_lines <- function(text,iso_lines,n=10) {
output = tibble()
text <- text %>% select(sentences, sentence_number)
for(x in iso_lines$sentence_number){
output<- text %>% filter(abs(x - sentence_number) < n) %>%
bind_rows(output)
}
output <- arrange(output, sentence_number)
return(output)
}
# Loading all libraries for analysis
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(gutenbergr)
library(rvest)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:rvest':
##
## guess_encoding
library(tidytext)
library(tidyr)
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
## Loading required package: ggplot2
# Pulls necessary texts from Gutenberg or Archive.org where appropriate
gilgamesh_text <- read_html("https://archive.org/stream/TheEpicofGilgamesh_201606/eog_djvu.txt") %>%
html_node("main") %>% html_node("pre") %>% html_text() %>% read_lines() %>% as_tibble() %>%
mutate(Line = row_number()) %>% rename('text' = 'value')
ramayana_text <- gutenberg_download("24869", strip = TRUE)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
hercules_text <- gutenberg_download("50569", strip=TRUE)
aeneid_text <- gutenberg_download("228", strip = TRUE)
library(tidytext)
library(tidyr)
# Defining custom stop words to add to our dictionary
word <- c("thou","thee","thy")
lexicon <- c("CUSTOM","CUSTOM","CUSTOM")
custom_stop_words <- data.frame(word,lexicon)
myth_stop_words <- rbind(stop_words, custom_stop_words)
# Tokenizes texts and removes stop words
gilgamesh_bigrams <- process_text_to_bigram(gilgamesh_text)
ramayana_bigrams <- process_text_to_bigram(ramayana_text)
hercules_bigrams <- process_text_to_bigram(hercules_text)
aeneid_bigrams <- process_text_to_bigram(aeneid_text)
# Standard analysis to count the most common words that appear in a text
gilgamesh_counts <- gilgamesh_bigrams %>% count(bigram, sort=TRUE)
ramayana_counts <- ramayana_bigrams %>% count(bigram, sort=TRUE)
hercules_counts <- hercules_bigrams %>% count(bigram, sort=TRUE)
aeneid_counts <- aeneid_bigrams %>% count(bigram, sort=TRUE)
library(igraph)
# Creates necessary plot object to display words that appear commonly together
gilgamesh_graph <- graph_from_counts(gilgamesh_counts)
## Warning: Expected 2 pieces. Additional pieces discarded in 22 rows [91, 97, 149,
## 153, 375, 392, 466, 619, 635, 636, 742, 743, 766, 878, 879, 939, 1050, 1051,
## 1133, 1184, ...].
ramayana_graph <- graph_from_counts(ramayana_counts)
## Warning: Expected 2 pieces. Additional pieces discarded in 5325 rows [1, 7, 10,
## 17, 21, 25, 32, 41, 51, 55, 56, 65, 68, 70, 71, 78, 83, 95, 98, 108, ...].
hercules_graph <- graph_from_counts(hercules_counts)
## Warning: Expected 2 pieces. Additional pieces discarded in 158 rows [9, 15,
## 22, 64, 65, 71, 72, 92, 103, 147, 168, 181, 182, 185, 193, 197, 199, 200, 201,
## 202, ...].
aeneid_graph <- graph_from_counts(aeneid_counts)
## Warning: Expected 2 pieces. Additional pieces discarded in 3209 rows [14, 15,
## 19, 27, 31, 42, 44, 48, 54, 55, 57, 71, 86, 91, 92, 93, 99, 104, 106, 125, ...].
plot_graph(gilgamesh_graph)
## Warning: Ignoring unknown parameters: check_overlap

plot_graph(ramayana_graph)
## Warning: Ignoring unknown parameters: check_overlap
## Warning: ggrepel: 144 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

plot_graph(hercules_graph)
## Warning: Ignoring unknown parameters: check_overlap

plot_graph(aeneid_graph)
## Warning: Ignoring unknown parameters: check_overlap

# Retokenizes raw text by sentence level rather than by bigrams
gilgamesh_sentences <- text_to_sentence(gilgamesh_text)
ramayana_sentences <- text_to_sentence(ramayana_text)
hercules_sentences <- text_to_sentence(hercules_text)
aeneid_sentences <- text_to_sentence(aeneid_text)
# Custom dictionary of thematic keywords that will dictate which sentences will be analyzed
thematic_keywords = c("magic","priest","sorcerer","magician","shaman","wizard","life","death","rebirth","resurrection",
"heaven","hell","gift","dream","water","fire","earth","wind","air","marriage","king","soul","feast","man","woman","god","godess")
# Isolates thematic sentences according to thematic_keywords dictionary.
# Setting the second parameter pull_nearby to TRUE indicates that we will pull
# the sentences containing thematic keywords, as well as sentences occurring within 10 lines.
gilgamesh_thematic_sentences <- isolate_thematic_sentences(gilgamesh_sentences, TRUE)
## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): longer object length is not a multiple of shorter object length
ramayana_thematic_sentences <- isolate_thematic_sentences(ramayana_sentences, TRUE)
hercules_thematic_sentences <- isolate_thematic_sentences(hercules_sentences, TRUE)
## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): longer object length is not a multiple of shorter object length
aeneid_thematic_sentences <- isolate_thematic_sentences(aeneid_sentences, TRUE)
## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): longer object length is not a multiple of shorter object length
# Retokenizes our isolated thematic sentences and surrounding lines by bigram.
gilgamesh_thematic_byword <- bigrams_by_sentence(gilgamesh_thematic_sentences)
ramayana_thematic_byword <- bigrams_by_sentence(ramayana_thematic_sentences)
hercules_thematic_byword <- bigrams_by_sentence(hercules_thematic_sentences)
aeneid_thematic_byword <- bigrams_by_sentence(aeneid_thematic_sentences)
# Counts the thematic bigrams, removing stop words first.
gilgamesh_thematic_counts<- count_thematic_bigrams(gilgamesh_thematic_byword)
ramayana_thematic_counts <- count_thematic_bigrams(ramayana_thematic_byword)
hercules_thematic_counts <- count_thematic_bigrams(hercules_thematic_byword)
aeneid_thematic_counts <- count_thematic_bigrams(aeneid_thematic_byword)
library(igraph)
# Creates graph object for new counts
gilgamesh_thematic_graph <- graph_from_counts(gilgamesh_thematic_counts, 1)
## Warning: Expected 2 pieces. Additional pieces discarded in 2 rows [510, 679].
ramayana_thematic_graph <- graph_from_counts(ramayana_thematic_counts, 5)
## Warning: Expected 2 pieces. Additional pieces discarded in 121 rows [42, 44, 53,
## 57, 80, 104, 111, 184, 223, 249, 267, 462, 525, 564, 616, 625, 639, 651, 652,
## 653, ...].
hercules_thematic_graph <- graph_from_counts(hercules_thematic_counts, 1)
## Warning: Expected 2 pieces. Additional pieces discarded in 8 rows [136, 141,
## 151, 173, 218, 246, 372, 373].
aeneid_thematic_graph <- graph_from_counts(aeneid_thematic_counts, 1)
## Warning: Expected 2 pieces. Additional pieces discarded in 136 rows [4, 5, 13,
## 43, 51, 62, 64, 68, 72, 83, 84, 85, 92, 95, 97, 105, 106, 111, 112, 116, ...].
library(ggraph)
plot_graph(gilgamesh_thematic_graph)
## Warning: Ignoring unknown parameters: check_overlap

plot_graph(ramayana_thematic_graph)
## Warning: Ignoring unknown parameters: check_overlap

plot_graph(hercules_thematic_graph)
## Warning: Ignoring unknown parameters: check_overlap

plot_graph(aeneid_thematic_graph)
## Warning: Ignoring unknown parameters: check_overlap
